먼저 위키백과(Wikipedia)의 OECD 문서에서 OECD 회원국 목록을 추출한다.
library(tidyverse)
library(rvest)
# Download the English Wikipedia OECD article, select the nodes matched by the
# XPath below, convert them to data frames and keep the first one.
# NOTE(review): the positional XPath (table[5]) is presumably the member-country
# table; it depends on the current page layout -- confirm before re-running.
oecd_countries <- html_table(
  html_nodes(
    read_html("https://en.wikipedia.org/wiki/OECD"),
    xpath = '//*[@id="mw-content-text"]/div[1]/table[5]'
  )
)[[1]]
# Reduce the scraped table to three columns: `content` (country name),
# `start` (a "day month year" date extracted from the membership column and
# parsed to a Date) and `group` (geographic location).
# No `id` column is created here (a row_number()-based id was left disabled).
oecd_df <- oecd_countries %>%
  as_tibble() %>%
  janitor::clean_names() %>%
  transmute(
    content = country,
    # Pull the first "d Month yyyy" pattern out of the cell, then parse it.
    start = lubridate::dmy(
      str_extract(membership_1, "[0-9]{1,2}\\s[a-zA-Z]+\\s[0-9]{4}")
    ),
    group = geographic_location
  )
oecd_df
# A tibble: 37 x 3
content start group
<chr> <date> <chr>
1 Australia 1971-06-07 Oceania
2 Austria 1961-09-29 Europe
3 Belgium 1961-09-13 Europe
4 Canada 1961-04-10 North America
5 Chile 2010-05-07 South America
6 Colombia 2020-04-28 South America
7 Czech Republic 1995-12-21 Europe
8 Denmark 1961-05-30 Europe
9 Estonia 2010-12-09 Europe
10 Finland 1969-01-28 Europe
# ... with 27 more rows
OECD 가입 순서를 대륙별로 나누어 타임라인으로 시각화해보자.
library(timevis)
# Build the group table passed to timevis(): one row per distinct `group`
# value of oecd_df, with that value used both as the group id and as its label.
location <- oecd_df %>%
  count(group) %>%
  transmute(id = group, content = group)
# Draw the OECD accession timeline, one lane per group in `location`.
# timevis uses row names as item ids when the data has no `id` column, so the
# country name is supplied as `id`; otherwise setSelection("South Korea")
# would not match any item.
oecd_df %>%
  mutate(id = content) %>%
  timevis(groups = location, options = list(stack = FALSE)) %>%
  setOptions(list(editable = TRUE)) %>%
  setSelection("South Korea") %>%
  fitWindow(list(animation = FALSE))

# Scrape a second table from the same Wikipedia article and normalise its
# column names with janitor::clean_names().
# NOTE(review): the positional XPath (table[6]) is presumably the per-country
# indicator table; it depends on the current page layout -- confirm.
oecd_fact <- read_html("https://en.wikipedia.org/wiki/OECD") %>%
  html_nodes(xpath = '//*[@id="mw-content-text"]/div[1]/table[6]') %>%
  html_table() %>%
  .[[1]] %>%
  as_tibble() %>%
  janitor::clean_names()
# Clean the scraped indicator table:
#   1. assign fixed names to its 14 columns,
#   2. turn literal "N/A" cells into NA,
#   3. drop `rli`,
#   4. fill three specific cells with hand-entered values,
#   5. remove rows whose country matches "Luxembourg|OECD|Country",
#   6. parse every column from `area` to `di` as a number.
oecd_fact_df <- oecd_fact %>%
  set_names(c(
    "country", "area", "population", "gdp",
    "gdp_per_capita", "income_inequality",
    "hdi", "fsi", "rli", "cpi", "ief",
    "gpi", "wpfi", "di"
  )) %>%
  # Columns are still text here, so "N/A" is compared as a string.
  mutate(across(c(income_inequality, fsi, rli, gpi), ~ na_if(.x, "N/A"))) %>%
  select(-rli) %>% # rli is missing for 8 countries
  # Overrides are written as strings because the columns are parsed to
  # numbers only in the final step.
  mutate(
    income_inequality = case_when(
      country == "New Zealand" ~ "33",
      country == "Poland" ~ "31.8",
      TRUE ~ income_inequality
    ),
    fsi = if_else(country == "Israel", "75.1", fsi)
  ) %>%
  filter(!str_detect(country, "Luxembourg|OECD|Country")) %>%
  mutate(across(area:di, parse_number))
## Missing values per variable
# Count the NAs in each column of oecd_fact_df and list the columns from most
# to fewest missing values. map_int() guarantees an integer result.
# Note: the first output column is named "country" but holds variable names;
# the name is kept so the result matches the printed output below.
oecd_fact_df %>%
  map_int(function(col) sum(is.na(col))) %>%
  enframe(name = "country", value = "missings") %>%
  arrange(desc(missings))
# A tibble: 13 x 2
country missings
<chr> <int>
1 country 0
2 area 0
3 population 0
4 gdp 0
5 gdp_per_capita 0
6 income_inequality 0
7 hdi 0
8 fsi 0
9 cpi 0
10 ief 0
11 gpi 0
12 wpfi 0
13 di 0
## Missing values per observation (country)
# Count the NA cells in each row of oecd_fact_df, pair the count with the
# country name, and sort from most to fewest missing values.
tibble(
  missings = unname(rowSums(is.na(oecd_fact_df))),
  country = oecd_fact_df$country
) %>%
  arrange(desc(missings))
# A tibble: 36 x 2
missings country
<dbl> <chr>
1 0 Australia
2 0 Austria
3 0 Belgium
4 0 Canada
5 0 Chile
6 0 Colombia
7 0 Czech Republic
8 0 Denmark
9 0 Estonia
10 0 Finland
# ... with 26 more rows
정제된 데이터를 경제 지표뿐 아니라 다양한 지수를 기준으로 표로 살펴보자.
# Show the cleaned data as an interactive DT table, sorted by descending gdp.
# The four listed columns are rounded to 0 digits with a separator every
# 3 digits.
DT::formatRound(
  DT::datatable(arrange(oecd_fact_df, desc(gdp))),
  columns = c("area", "population", "gdp", "gdp_per_capita"),
  digits = 0,
  interval = 3
)